library(tidyverse)
library(ggplot2)
library(ggpubr)
library(plotly)
library(data.table)
library(rstatix)
library(countrycode)
library(car)

Part I

Question 1

data <- data.table(read.csv("gapminder_clean.csv")) # used read.csv to leverage ggpubr plotting
data <- data %>% mutate(continent = countrycode(
  sourcevar = Country.Name,
  origin = "country.name",
  destination = "continent"
))
data[is.na(continent), continent := "Unknown"]

Question 2 & 3

To determine the relationship between normalized CO2 emissions and GDP per capita in 1962 we’ll use Pearson’s correlation. We’ll do this by setting cor.coef=TRUE in ggscatter (default method is Pearson’s correlation).

data %>%
  filter(Year == 1962) %>%
  ggscatter(
    x = grep("emissions", colnames(data), value = TRUE),
    y = "gdpPercap",
    add = "reg.line",
    cor.coef = TRUE
  ) + 
  labs(
    x = expression(paste(CO[2], " emissions (metric tons per capita)")),
    y = "GDP per Capita",
    title = expression(paste("Correlation between ", CO[2], " emissions and GDP per capita"))
  )

Interpretation: As you can see from the Pearson correlation coefficient (R) normalized CO2 emissions and GDP per capita in 1962 were highly correlated

Question 4

To determine what year normalized CO2 emissions and GDP per capita were the most highly correlated we’ll group by year, then correlate using Pearson’s correlation (cor’s default method)

maxCorrYear <- data %>%
  group_by(Year) %>%
  mutate(
    correlation = cor(
      CO2.emissions..metric.tons.per.capita.,
      gdpPercap,
      use = "complete.obs"
    )
  ) %>%
  ungroup() %>%
  filter(correlation == max(correlation, na.rm = TRUE))

The highest correlation between normalized CO2 emissions and GDP per capita is 1967

Question 5

p <- maxCorrYear %>%
  ggscatter(
    x = grep("emissions", colnames(data), value = TRUE),
    y = "gdpPercap",
    size = "pop",
    color = "continent"
  ) +
  labs(
    x = "CO<sub>2</sub> emissions (metric tons per capita)",
    y = "GDP per Capita",
    title = "Correlation between CO<sub>2</sub> emissions and GDP per capita",
    size = NULL,
    color = "Continent"
  )

ggplotly(p)

Part II

Question 1

We will complete the following steps to determine the relationship between continent and normalized energy usage:

  1. Calculate the mean of each country’s normalized energy usage across the given time period (1962 - 2007)
  2. Ensure each country has an assigned continent
  3. Test our data for normality using the Shapiro-Wilks test
  4. Test our data for the equal variance assumption using Levene’s test
  5. Depending on our test for normality and equal variance, apply the appropriate statistic to compare energy usage by continent
  6. Plot results

2.1.1 - 2.1.2

country_energy_use <- data %>%
  group_by(Country.Name) %>%
  reframe(
    mean_energy_use = na.omit(
      mean(
        Energy.use..kg.of.oil.equivalent.per.capita.,
        na.rm = TRUE
      )
    )
  )

energy_use <- merge(
  country_energy_use,
  data[!duplicated(Country.Name), c("Country.Name", "continent"), with = FALSE]
)

2.1.3

normality <- energy_use %>%
  group_by(continent) %>%
  summarise(
    statistic = shapiro.test(mean_energy_use)$statistic,
    p_value = shapiro.test(mean_energy_use)$p.value
  )

normality_tbl <- normality %>%
  ggtexttable(
    rows = NULL,
    theme = ttheme("blank")
  ) %>%
  tab_add_hline(at.row = c(1, 2), row.side = "top", linewidth = 2) %>%
  tab_add_hline(at.row = nrow(normality) + 1, row.side = "bottom", linewidth = 2)

normality_tbl

Interpretation: The energy usage within each continent deviates significantly from a normal distribution (p-values < 0.05)

2.1.4

variance <- leveneTest(mean_energy_use ~ continent, data = energy_use)

Levene’s statistic: 1.6483968, Levene’s p-value: 0.1485942

Interpretation: The p-value is > 0.05 meaning the variances in energy usage across each continent are not significantly different

2.1.5

Since our dataset is non-normal but has equal variance we should deploy a non-parametric test that does not rely on the assumption of normality, like the Wilcoxon test. The Wilcoxon test will allow us to perform a pairwise comparison between the energy usage of each continent.

pairwise_result <- pairwise_wilcox_test(
  data = energy_use,
  formula = mean_energy_use ~ continent,
  p.adjust.method = "none"
) %>%
  rename("p.signif" = "p.adj.signif") %>%
  select(group1, group2, p.signif)

tbl <- pairwise_result %>%
  ggtexttable(
    rows = NULL,
    theme = ttheme("blank")
  ) %>%
  tab_add_hline(at.row = c(1, 2), row.side = "top", linewidth = 2) %>%
  tab_add_hline(at.row = nrow(pairwise_result) + 1, row.side = "bottom", linewidth = 2)

2.1.6

Order the continents by median energy use

x_axis_order <- energy_use %>%
  group_by(continent) %>%
  summarize(
    median = median(mean_energy_use, na.rm = TRUE)
  ) %>%
  arrange(median) %>%
  pull(continent)

Generate plot

p <- energy_use %>%
  ggboxplot(
    x = "continent",
    y = "mean_energy_use",
    color = "continent",
    add = "jitter",
    order = x_axis_order,
    outlier.shape = NA,
  ) +
  labs(
    title = "Energy Use (kg oil equivalent per capita) by Continent",
    x = NULL,
    y = "Mean energy use (kg of oil equivalent per capita)"
  )

p <- ggpar(p, legend = "none")
ggarrange(p, tbl, ncol = 2, nrow = 1)

Interpretation: The figure above indicates that Europe has the highest per capita energy usage with Africa and Oceania tied for the lowest per capita energy usage. The energy usage of the Americas, Asia, and entities not associated with a continent fall somewhere in between.

Question 2

We will complete the following steps to determine if there a significant difference between Europe and Asia with respect to imports in the years after 1990:

  1. Calculate the mean of each countries import percentage after 1990
  2. Test our data for normality using the Shapiro-Wilks test
  3. Test our data for the equal variance assumption using Levene’s test
  4. Depending on our test for normality and equal variance, apply the appropriate statistic to compare import percentage
  5. Plot results

2.2.1

country_percent_imports <- data %>%
  filter(continent %in% c("Europe", "Asia") & Year >= 1990) %>%
  group_by(Country.Name) %>%
  reframe(
    mean_import_perc = na.omit(
      mean(
        Imports.of.goods.and.services....of.GDP.,
        na.rm = TRUE
      )
    )
  )

percent_imports <- merge(
  country_percent_imports,
  data[!duplicated(Country.Name), c("Country.Name", "continent"), with = FALSE]
)

2.2.2

normality <- percent_imports %>%
  group_by(continent) %>%
  summarise(
    statistic = shapiro.test(mean_import_perc)$statistic,
    p_value = shapiro.test(mean_import_perc)$p.value
  )

normality_tbl <- normality %>%
  ggtexttable(
    rows = NULL,
    theme = ttheme("blank")
  ) %>%
  tab_add_hline(at.row = c(1, 2), row.side = "top", linewidth = 2) %>%
  tab_add_hline(at.row = nrow(normality) + 1, row.side = "bottom", linewidth = 2)

normality_tbl

Interpretation: Import percentage for each continent deviates significantly from a normal distribution (p-values < 0.05)

2.2.3

variance <- leveneTest(mean_import_perc ~ continent, data = percent_imports)

Levene’s statistic: 1.1410734, Levene’s p-value: 0.2883807

Interpretation: The p-value is > 0.05 meaning the variances in import percentage across each continent are not significantly different

2.2.3

Since our dataset is non-normal but has equal variance we should deploy a non-parametric test that does not rely on the assumption of normality, like the Wilcoxon test. stat_compare_means will automatically perform this calculation and plot the result on our boxplot.

percent_imports %>%
  ggboxplot(
    x = "continent",
    y = "mean_import_perc",
    add = "jitter",
    xlab = FALSE,
    ylab = "Mean import percentage of goods and services",
    title = "Import percentage of goods and services between Asia and Europe post 1990",
    outlier.shape = NA
  ) +
  stat_compare_means()

As you can see from the Wilcoxon p-value (> 0.05), there is no significant difference between Europe and Asia with respect to the import percentage in the years after 1990

Question 3

country_highest_pop <- data %>%
  group_by(Country.Name) %>%
  summarize(
    mean = mean(Population.density..people.per.sq..km.of.land.area., na.rm = TRUE)
  ) %>%
  filter(mean == max(mean, na.rm = TRUE)) %>%
  pull(Country.Name)

The countries with the highest population density per sq km of land are Macao SAR and China

Question 4

country_highest_life <- data %>%
  filter(Year >= 1962 & Year <= 2007) %>%
  select(Country.Name, Year, Life.expectancy.at.birth..total..years.) %>%
  spread(Year, Life.expectancy.at.birth..total..years.) %>%
  mutate(life_expectancy_increase = `2007` - `1962`) %>%
  filter(life_expectancy_increase == max(life_expectancy_increase, na.rm = TRUE)) %>%
  pull(Country.Name)

The country with the greatest increase in life expectancy at birth between 1962 and 2007 is the Maldives